/*
This code reads in the 941 data and reformat the PPP data matched to EINs (one observation per id-year) to produce the files used for the regression analysis of wage effects and the files used to join with the distributional files.

*/

do "${dodir}/make_globals.do"

//Import and save the 941 data

forvalues year = 2018/2021 {
	import delimited "${datadir}/f941/f941_`year'.csv", clear
	save "${datadir}/f941/f941_`year'", replace
}

clear

//Append the 941 data in to one file for all of the years

forvalues year = 2018/2021 {
	append using "${datadir}/f941/f941_`year'"
	cap gen yr = `year'
	replace yr = `year' if yr == .
}

replace state = subinstr(state, "'", "", .)

duplicates drop

//Drop probable typos
gen avg_wage = total_compensation / num_employees
bysort id: egen min_avg_wage = min(avg_wage)
drop if min_avg_wage <= 100
drop if min_avg_wage == .

//Temporary handle for duplicates
gen sane_dist = abs(5000 - avg_wage)

gsort id tax_prd sane_dist

duplicates drop id period, force

gen quarter = yq(floor(period/100), mod(period,100)/3)
format quarter %tq

*Format NAICS Codes
gen naics = substr(industry,2,6)
destring naics, replace force

replace naics = naics * 1000 if naics<1000
replace naics = naics * 100 if naics<10000
replace naics = naics * 10 if naics<100000
replace naics=. if naics<100000

* limit to valid XXX digit naics codes
tostring naics, replace
g naics_2=substr(naics,1,2)
g naics_3=substr(naics,1,3)
g naics_4=substr(naics,1,4) 
g naics_6=substr(naics,1,6) 

destring naics naics_*, replace

//Code that checks NAICS codes against year-specific valid lists (available upon request)

do "$cleaning/check_naics.do" 2 naics_2 tax_yr
do "$cleaning/check_naics.do" 3 naics_3 tax_yr
do "$cleaning/check_naics.do" 4 naics_4 tax_yr
do "$cleaning/check_naics.do" 6 naics_6 tax_yr

save "${datadir}/f941", replace

//Import and save the match data
use "${datadir}/match/all_matches", clear
rename raw_id ein


replace ein = subinstr(ein, "'", "", .)


replace id = _n if id == .

save "${datadir}/all_match", replace

//Add EINS to the full PPP data set and then make the dataset unique by id
use "${datadir}/sba/public_all", clear

merge 1:1 loannumber using "${datadir}/all_match", nogen keep(match using)

keep loannumber dateapproved processingmethod borrowername borroweraddress borrowercity borrowerstate borrowerzip /// 
initialapprovalamount currentapprovalamount undisbursedamount forgivenessamount jobsreported naicscode businesstype /// 
 firm_tin

//Drop unmatched loans
drop if firm_tin == .
rename firm_tin id

//Make unique by id and processingmethod
gsort id dateapproved
gsort id processingmethod dateapproved

duplicates drop id processingmethod, force


//Reshape the dataset so that there is one observation per id year (first and second round loans joined into one observation)
gen round = processingmethod == "PPS"

drop loannumber processingmethod borrowername borroweraddress
	
replace borrowerzip = substr(borrowerzip,1,5)
	

//Generate some time variables
gen app_date = date(dateapproved, "MDY")
format app_date %td
gen app_day = day(app_date)
gen app_month = month(app_date)
gen app_year = year(app_date)
gen app_quarter = qofd(app_date)
format app_quarter %tq

	
reshape wide dateapproved forgivenessamount initialapprovalamount currentapprovalamount undisbursedamount /// 
	jobsreported naicscode businesstype borrowerzip borrowercity borrowerstate /// 
	app_*, i(id) j(round)
	

//Differentiate between the first and second round loans
rename *0 *_first
rename *1 *_second
	
save "${datadir}/ppp_wide", replace

//Combine 941 data and PPP data for the regression analysis

use "${datadir}/f941", clear

merge m:1 id using "${datadir}/ppp_wide", nogen keep(master matched)

drop period 

order id quarter firm_state

tsset id quarter

save "${datadir}/analysis_data", replace



//Now for distributions do not drop loans that are not matched to a id to allow for scale up of effects

//Add EINS to the full PPP data set and then make the dataset unique by id
use "${datadir}/sba/public_all", clear

merge 1:1 loannumber using "${datadir}/all_match", nogen

keep loannumber dateapproved processingmethod forgivenessamount jobsreported naicscode businesstype /// 
ein id match_type

//Add a unique identifier for unmatched loans.
replace id = _n if id == .

//Collpase for the distributional side
collapse (sum) forgivenessamount jobsreported (firstnm) naicscode businesstype dateapproved loannumber match_type, by(id processingmethod)


//Reshape the dataset
gen round = processingmethod == "PPS"

gen app_date = date(dateapproved, "MDY")

format app_date %td

gen app_day = day(app_date)
gen app_month = month(app_date)
gen app_year = year(app_date)
gen app_quarter = qofd(app_date)
format app_quarter %tq

drop processingmethod

	
reshape wide loannumber dateapproved match_type forgivenessamount jobsreported naicscode businesstype /// 
	app_*, i(id) j(round)


//Differentiate between the first and second round loans	
rename *0 *_first
rename *1 *_second
	
save "${datadir}/ppp_wide_dist", replace
